Introduction

We will investigate protein-protein interactions (PPIs) for a given list of proteins. To do so, we will retreive PPI data from the STRING database and visualize networks in Cytoscape using the R package RCy3. We will further cluster the network to investigate whether subsets of proteins share a particular molecular function.

Requirements

Cytoscape running, with the clusterMaker app installed.

Load libraries

library(STRINGdb) 
library(RCy3)
library(dplyr)
library(purrr)
library(tidyr)
library(readxl)
library(readr)
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)
# Confirm that Cytoscape is running
cytoscapeVersionInfo ()
##       apiVersion cytoscapeVersion 
##             "v1"         "3.10.2"

Load protein list

proteins <- readLines("data/gene_symbols_list.txt")
head(proteins)
## [1] "IFI30"    "IFITM2"   "FGL2"     "STON1"    "FAM129A"  "SERPINA3"

Initialize STRING db

# Initialize STRINGdb object 
string_db <- STRINGdb$new(version="12.0", 
                          species=9606, # Homo sapiens
                          score_threshold=400, 
                          input_directory="")

Map gene names to STRING aliases

# Create a dataframe with uppercase letters (to match with STRING alias mapping)
data <- data.frame(query.term = toupper(proteins)) 

# check if file with mapping to STRING aliases exists
# and create it if it doesn't
if (file.exists("data/mapped2STRING.csv")) {
  
  mapped <- read.csv("data/mapped2STRING.csv")
  
} else {
  
mapped <- string_db$map(my_data_frame = data, 
                        my_data_frame_id_col_names = "query.term", 
                        takeFirst=TRUE, 
                        removeUnmappedRows=TRUE, 
                        quiet=FALSE)
write.csv(x = mapped, file = "data/mapped2STRING.csv", quote = FALSE, row.names = FALSE)

}

head(mapped)
##   query.term            STRING_id
## 1      IFI30 9606.ENSP00000384886
## 2     IFITM2 9606.ENSP00000484689
## 3       FGL2 9606.ENSP00000248598
## 4      STON1 9606.ENSP00000384615
## 5    FAM129A 9606.ENSP00000356481
## 6   SERPINA3 9606.ENSP00000450540
# Check how many proteins were not mapped
# Since some gene IDs can map to several STRING identifiers, we account for duplicates 
length(unique(data$query.term)) - length(unique(mapped$query.term))
## [1] 1

Fetch interaction data

# Get interactions for the mapped proteins if the file isn't already there

# check if file with interaction data exists 
# and create it if it doesn't
if (file.exists("data/interactions.csv")) {
  
  interactions <- read.csv("data/interactions.csv")
  
} else {
  
interactions <- string_db$get_interactions(mapped$STRING_id)
write.csv(x = interactions, file = "data/interactions.csv", quote = FALSE, row.names = FALSE)
}

# View the first few interactions 
head(interactions)
##                   from                   to combined_score
## 1 9606.ENSP00000162749 9606.ENSP00000216117            435
## 2 9606.ENSP00000162749 9606.ENSP00000216117            435
## 3 9606.ENSP00000009530 9606.ENSP00000225831            497
## 4 9606.ENSP00000009530 9606.ENSP00000225831            497
## 5 9606.ENSP00000162749 9606.ENSP00000225831            768
## 6 9606.ENSP00000162749 9606.ENSP00000225831            768

Create dataframe for PPI network

Define nodes for the network

# Get node columns for the Cytoscape network
nodes <- data.frame(id = unique(c(interactions$from, interactions$to)))

# Merge with the mapped protein names to include original protein names as labels
nodes <- merge(nodes, mapped[, c("STRING_id", "query.term")], 
               by.x = "id", by.y = "STRING_id", 
               all.x = TRUE, 
               all.y = TRUE)

head(nodes)
##                     id query.term
## 1 9606.ENSP00000009530       CD74
## 2 9606.ENSP00000162749   TNFRSF1A
## 3 9606.ENSP00000206423     CCDC80
## 4 9606.ENSP00000209929       FMO2
## 5 9606.ENSP00000216117      HMOX1
## 6 9606.ENSP00000225831       CCL2

Define edges for the network

edges <- data.frame(source = interactions$from, target = interactions$to)

# Remove directionality of edges (necessary for various clusterings later)
# Combine and sort the columns, then get unique rows
unique_edges <- unique(t(apply(edges, 1, function(x) sort(x))))

# Convert back to a dataframe
edges <- as.data.frame(unique_edges, stringsAsFactors = FALSE)

# Rename the columns if needed
colnames(edges) <- c("source", "target")

head(edges)
##                 source               target
## 1 9606.ENSP00000162749 9606.ENSP00000216117
## 2 9606.ENSP00000009530 9606.ENSP00000225831
## 3 9606.ENSP00000162749 9606.ENSP00000225831
## 4 9606.ENSP00000216117 9606.ENSP00000225831
## 5 9606.ENSP00000225831 9606.ENSP00000245907
## 6 9606.ENSP00000245907 9606.ENSP00000246006

Make the network in Cytoscape

# Connect to Cytoscape
cytoscapePing()
## You are connected to Cytoscape!
# Create a new Cytoscape network from your data
createNetworkFromDataFrames(nodes = nodes, 
                            edges = edges, 
                            # title = mytitle, 
                            collection = "My Collection")
## Loading data...
## Applying default style...
## Applying preferred layout...
## networkSUID 
##        9707
# Set node labels to the original protein names, and other visual tweaks
setNodeLabelMapping('query.term')
## style.name not specified, so updating "default" style.
## NULL
setNodeShapeDefault('ELLIPSE')
## style.name not specified, so updating "default" style.
setNodeColorDefault('#9fbcda')
## style.name not specified, so updating "default" style.

Now we modify the network layout in the app to reduce overlap of nodes and make it overall more aesthetically pleasing. Parameters will depend on the network

# using force-directed layout. Higher coefficients means nodes are closer together. 
layoutNetwork('force-directed defaultSpringCoefficient=0.000006 defaultSpringLength=1')

Save and View the network

# Make sure network fits in the frame to be saved 
fitContent()

#save network image
exportImage(filename = "currentnetwork.png") 
##                                                                    file 
## "/Users/inikaprasad/Desktop/Biostats2024/func-omics/currentnetwork.png"
# view network image
knitr::include_graphics("currentnetwork.png") 

Cluster PPI network

commands specify: restoreEdges: restores edges after clustering, showUI: displays the new network, and undirectedEdges: assumes edges are undirected

GLay clustering

Run GLay community clustering

# Run GLay community clustering
RCy3::commandsRun("cluster glay restoreEdges= true showUI = true undirectedEdges = true")
## [1] "  Clusters: 14"      "Average size: 4,429" "Maximum size: 18"   
## [4] "Minimum size: 1"     "Modularity: 0,589"
# reduce overlaps by making the nodes less inclined to be close to each other
layoutNetwork('force-directed defaultSpringCoefficient=0.000006 defaultSpringLength=1')

Visualize network after clustering

# Make sure network fits in the frame to be saved 
fitContent()

#save network image
exportImage(filename = "currentnetwork_clustered.png") 
##                                                                              file 
## "/Users/inikaprasad/Desktop/Biostats2024/func-omics/currentnetwork_clustered.png"
# view network image
knitr::include_graphics("currentnetwork_clustered.png") 

Save network info

# Get the table with clustering results 
network_table <- getTableColumns()
clusterinfo <- table(network_table$`__glayCluster`)
clusterinfo
## 
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 
## 18 18 14  1  1  2  1  1  1  1  1  1  1  1

Members of each big cluster

# Save the members of clusters with >5 nodes for subsequent analysis 
bigclusters <- names(clusterinfo)[(table(network_table$`__glayCluster`) > 5)]
bigclusternames <- paste0("cluster", bigclusters)

# empty list of clusters to cycle through
clusterlist <- list()

for (i in 1:length(bigclusters)){
  
  # create given clustername
  given_clustername <- bigclusternames[i]
  
  # make a list of these lists 
  clusterlist[[i]] <- network_table$query.term[network_table$`__glayCluster` == bigclusters[i]]
}

names(clusterlist) <- bigclusternames

Members of each cluster

clusterlist
## $cluster1
##  [1] "TNFRSF1A"  "CD93"      "EMP1"      "ICAM1"     "FCGR2A"    "S100A11"  
##  [7] "VCAM1"     "TLR3"      "TNFRSF11B" "OLR1"      "ANXA2"     "FCGR1A"   
## [13] "IL13RA1"   "IL1R1"     "CD44"      "MSR1"      "ANGPT1"    "MYD88"    
## 
## $cluster2
##  [1] "CD74"     "FGL2"     "RARRES3"  "LYZ"      "FCER1G"   "CCR1"    
##  [7] "MS4A7"    "C1QB"     "MS4A4A"   "NCF2"     "FCGR3A"   "CYBB"    
## [13] "IFI30"    "CD14"     "HLA-DPA1" "MS4A6A"   "PTAFR"    "IFITM2"  
## 
## $cluster3
##  [1] "HMOX1"    "CCL2"     "C3"       "GFAP"     "CP"       "SCIN"    
##  [7] "C7"       "TREM2"    "STEAP3"   "C4A"      "C4B"      "SERPINA1"
## [13] "SERPINA3" "APLNR"

Enrichment analysis for each cluster (GO Biological Process)

library(clusterProfiler)
library(org.Hs.eg.db)

# Run enrichment analysis on each cluster 
GO_BP_enrichments <- lapply(clusterlist, function(cluster) {
  
  enrichGO(gene = cluster,
           OrgDb = org.Hs.eg.db,
           keyType = "SYMBOL",
           ont = "BP",
           pAdjustMethod = "BH",
           qvalueCutoff = 0.01)
  
})

Create dotplots for results

for (x in 1:length(GO_BP_enrichments)){
  
  # pick out object with enrichments 
  enrichresult <- GO_BP_enrichments[[x]]
  
  # make dotplot object 
  p <- dotplot(object = enrichresult) + 
    ggtitle(paste0("ORA BP for ", names(clusterlist)[x]))
  
  print(p)
}

Enrichment analysis for each cluster (GO Molecular Function)

# Run enrichment analysis on each cluster 
GO_MF_enrichments <- lapply(clusterlist, function(cluster) {
  
  enrichGO(gene = cluster,
           OrgDb = org.Hs.eg.db,
           keyType = "SYMBOL",
           ont = "MF",
           pAdjustMethod = "BH",
           qvalueCutoff = 0.01)
  
})

Create dotplots for results

for (x in 1:length(GO_MF_enrichments)){
  
  # pick out object with enrichments 
  enrichresult <- GO_MF_enrichments[[x]]
  
  # make dotplot object 
  p <- dotplot(object = enrichresult) + 
    ggtitle(paste0("ORA MF for ", names(clusterlist)[x]))
  
  print(p)
}

Enrichment analysis for each cluster (GO Cellular Component)

library(clusterProfiler)
library(org.Hs.eg.db)

# Run enrichment analysis on each cluster 
GO_CC_enrichments <- lapply(clusterlist, function(cluster) {
  
  enrichGO(gene = cluster,
           OrgDb = org.Hs.eg.db,
           keyType = "SYMBOL",
           ont = "CC",
           pAdjustMethod = "BH",
           qvalueCutoff = 0.01)
  
})

Create dotplots for results

for (x in 1:length(GO_CC_enrichments)){
  
  # pick out object with enrichments 
  enrichresult <- GO_CC_enrichments[[x]]
  
  # make dotplot object 
  p <- dotplot(object = enrichresult) + 
    ggtitle(paste0("ORA CC for ", names(clusterlist)[x]))
  
  print(p)
}